setwd("~/Dropbox/clado-manuscript/Mikes_MS_Data/")
#source('http://bioconductor.org/biocLite.R')
#biocLite('phyloseq')
library(phyloseq)
library(ggplot2)
library(plyr)
library(dplyr)
library(Rmisc)
library(DESeq2)
library(doParallel)
# Load biom file. 
biom <- import_biom("OTU_table.biom", "~/Dropbox/clado-manuscript/Nephele/PipelineResults_NMEPINZ20QK1/nephele_outputs/tree.tre", parseFunction=parse_taxonomy_greengenes)
sam.data <- read.csv(file="sample.data.csv", row.names=1, header=TRUE)
head(sam.data)
##        TreatmentGroup  Site Date                       Description
## C172N1          Early North  172 Sample of day 172 at site North 1
## C172N2          Early North  172 Sample of day 172 at site North 2
## C172N3          Early North  172 Sample of day 172 at site North 3
## C172P1          Early Point  172 Sample of day 172 at site Point 1
## C172P2          Early Point  172 Sample of day 172 at site Point 2
## C172P3          Early Point  172 Sample of day 172 at site Point 3
##        SampleID.1
## C172N1     C172N1
## C172N2     C172N2
## C172N3     C172N3
## C172P1     C172P1
## C172P2     C172P2
## C172P3     C172P3
sam.data$Date <- as.factor(sam.data$Date)
sam.data$DateSite <- paste(sam.data$Date, sam.data$Site)
head(sam.data); str(sam.data)
##        TreatmentGroup  Site Date                       Description
## C172N1          Early North  172 Sample of day 172 at site North 1
## C172N2          Early North  172 Sample of day 172 at site North 2
## C172N3          Early North  172 Sample of day 172 at site North 3
## C172P1          Early Point  172 Sample of day 172 at site Point 1
## C172P2          Early Point  172 Sample of day 172 at site Point 2
## C172P3          Early Point  172 Sample of day 172 at site Point 3
##        SampleID.1  DateSite
## C172N1     C172N1 172 North
## C172N2     C172N2 172 North
## C172N3     C172N3 172 North
## C172P1     C172P1 172 Point
## C172P2     C172P2 172 Point
## C172P3     C172P3 172 Point
## 'data.frame':    52 obs. of  6 variables:
##  $ TreatmentGroup: Factor w/ 2 levels "Early","Late": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Site          : Factor w/ 3 levels "North","Point",..: 1 1 1 2 2 2 3 3 3 1 ...
##  $ Date          : Factor w/ 6 levels "172","178","185",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ Description   : Factor w/ 52 levels "Sample of day 172 at site North 1",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ SampleID.1    : Factor w/ 52 levels "C172N1","C172N2",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ DateSite      : chr  "172 North" "172 North" "172 North" "172 Point" ...
sample_data(biom) <- sam.data
biom; sample_data(biom)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 51928 taxa and 52 samples ]
## sample_data() Sample Data:       [ 52 samples by 6 sample variables ]
## tax_table()   Taxonomy Table:    [ 51928 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 51928 tips and 51914 internal nodes ]
## Sample Data:        [52 samples by 6 sample variables]:
##        TreatmentGroup  Site Date                       Description
## C178N1          Early North  178 Sample of day 178 at site North 1
## C178P1          Early Point  178 Sample of day 178 at site Point 1
## C185P2          Early Point  185 Sample of day 185 at site Point 2
## C206N2           Late North  206 Sample of day 206 at site North 2
## C206P1           Late Point  206 Sample of day 206 at site Point 1
## C206P2           Late Point  206 Sample of day 206 at site Point 2
## C214P1           Late Point  214 Sample of day 214 at site Point 1
## C214P2           Late Point  214 Sample of day 214 at site Point 2
## C214P3           Late Point  214 Sample of day 214 at site Point 3
## C214S1           Late South  214 Sample of day 214 at site South 1
## C214S2           Late South  214 Sample of day 214 at site South 2
## C214S3           Late South  214 Sample of day 214 at site South 3
## C185P1          Early Point  185 Sample of day 185 at site Point 1
## C185P3          Early Point  185 Sample of day 185 at site Point 3
## C199P3           Late Point  199 Sample of day 199 at site Point 3
## C199S2           Late South  199 Sample of day 199 at site South 2
## C199S3           Late South  199 Sample of day 199 at site South 3
## C206N1           Late North  206 Sample of day 206 at site North 1
## C178P2          Early Point  178 Sample of day 178 at site Point 2
## C199N3           Late North  199 Sample of day 199 at site North 3
## C206S1           Late South  206 Sample of day 206 at site South 1
## C214N3           Late North  214 Sample of day 214 at site North 3
## C199N2           Late North  199 Sample of day 199 at site North 2
## C206N3           Late North  206 Sample of day 206 at site North 3
## C206S2           Late South  206 Sample of day 206 at site South 2
## C199N1           Late North  199 Sample of day 199 at site North 1
## C199P1           Late Point  199 Sample of day 199 at site Point 1
## C199S1           Late South  199 Sample of day 199 at site South 1
## C214N1           Late North  214 Sample of day 214 at site North 1
## C172P1          Early Point  172 Sample of day 172 at site Point 1
## C199P2           Late Point  199 Sample of day 199 at site Point 2
## C172N3          Early North  172 Sample of day 172 at site North 3
## C172S3          Early South  172 Sample of day 172 at site South 3
## C178S2          Early South  178 Sample of day 178 at site South 2
## C178P3          Early Point  178 Sample of day 178 at site Point 3
## C178S3          Early South  178 Sample of day 178 at site South 3
## C172N1          Early North  172 Sample of day 172 at site North 1
## C172S1          Early South  172 Sample of day 172 at site South 1
## C178N3          Early North  178 Sample of day 178 at site North 3
## C185N2          Early North  185 Sample of day 185 at site North 2
## C185N3          Early North  185 Sample of day 185 at site North 3
## C185S3          Early South  185 Sample of day 185 at site South 3
## C214N2           Late North  214 Sample of day 214 at site North 2
## C172P2          Early Point  172 Sample of day 172 at site Point 2
## C185S2          Early South  185 Sample of day 185 at site South 2
## C172P3          Early Point  172 Sample of day 172 at site Point 3
## C185N1          Early North  185 Sample of day 185 at site North 1
## C172N2          Early North  172 Sample of day 172 at site North 2
## C178S1          Early South  178 Sample of day 178 at site South 1
## C185S1          Early South  185 Sample of day 185 at site South 1
## C172S2          Early South  172 Sample of day 172 at site South 2
## C178N2          Early North  178 Sample of day 178 at site North 2
##        SampleID.1  DateSite
## C178N1     C178N1 178 North
## C178P1     C178P1 178 Point
## C185P2     C185P2 185 Point
## C206N2     C206N2 206 North
## C206P1     C206P1 206 Point
## C206P2     C206P2 206 Point
## C214P1     C214P1 214 Point
## C214P2     C214P2 214 Point
## C214P3     C214P3 214 Point
## C214S1     C214S1 214 South
## C214S2     C214S2 214 South
## C214S3     C214S3 214 South
## C185P1     C185P1 185 Point
## C185P3     C185P3 185 Point
## C199P3     C199P3 199 Point
## C199S2     C199S2 199 South
## C199S3     C199S3 199 South
## C206N1     C206N1 206 North
## C178P2     C178P2 178 Point
## C199N3     C199N3 199 North
## C206S1     C206S1 206 South
## C214N3     C214N3 214 North
## C199N2     C199N2 199 North
## C206N3     C206N3 206 North
## C206S2     C206S2 206 South
## C199N1     C199N1 199 North
## C199P1     C199P1 199 Point
## C199S1     C199S1 199 South
## C214N1     C214N1 214 North
## C172P1     C172P1 172 Point
## C199P2     C199P2 199 Point
## C172N3     C172N3 172 North
## C172S3     C172S3 172 South
## C178S2     C178S2 178 South
## C178P3     C178P3 178 Point
## C178S3     C178S3 178 South
## C172N1     C172N1 172 North
## C172S1     C172S1 172 South
## C178N3     C178N3 178 North
## C185N2     C185N2 185 North
## C185N3     C185N3 185 North
## C185S3     C185S3 185 South
## C214N2     C214N2 214 North
## C172P2     C172P2 172 Point
## C185S2     C185S2 185 South
## C172P3     C172P3 172 Point
## C185N1     C185N1 185 North
## C172N2     C172N2 172 North
## C178S1     C178S1 178 South
## C185S1     C185S1 185 South
## C172S2     C172S2 172 South
## C178N2     C178N2 178 North
head(otu_table(biom))
## OTU Table:          [6 taxa and 52 samples]
##                      taxa are rows
##                                C178N1 C178P1 C185P2 C206N2 C206P1 C206P2
## New.CleanUp.ReferenceOTU155901      0      0      0      0      0      1
## New.CleanUp.ReferenceOTU321320      0      0      0      0      0      0
## KC551585.1.1451                     2      0      0      6      0      4
## JQ945994.1.1399                     0      0      0      0      0      0
## EF653577.1.1339                     0      0      0      0      0      0
## JQ814729.1.1495                   137      8      1     73      8      6
##                                C214P1 C214P2 C214P3 C214S1 C214S2 C214S3
## New.CleanUp.ReferenceOTU155901      0      0      1      0      1      0
## New.CleanUp.ReferenceOTU321320      0      0      0      0      0      0
## KC551585.1.1451                     9      4     11      2      1      0
## JQ945994.1.1399                     1      0      0      1      0      0
## EF653577.1.1339                     0      0      0      4      0      0
## JQ814729.1.1495                    11      4     27     40      8     23
##                                C185P1 C185P3 C199P3 C199S2 C199S3 C206N1
## New.CleanUp.ReferenceOTU155901      0      0      0      0      0      0
## New.CleanUp.ReferenceOTU321320      0      1      0      0      0      0
## KC551585.1.1451                     3      2      5      0      0     11
## JQ945994.1.1399                     0      0      0      0      0      0
## EF653577.1.1339                     0      0      0      0      0      0
## JQ814729.1.1495                     6      4      2     11      2     56
##                                C178P2 C199N3 C206S1 C214N3 C199N2 C206N3
## New.CleanUp.ReferenceOTU155901      0      0      0      0      0      0
## New.CleanUp.ReferenceOTU321320      0      0      0      0      0      0
## KC551585.1.1451                     0      1      4      3      0      3
## JQ945994.1.1399                     0      0      2      0      0      0
## EF653577.1.1339                     0      0      0      0      0      0
## JQ814729.1.1495                    10     23      6     19      5    849
##                                C206S2 C199N1 C199P1 C199S1 C214N1 C172P1
## New.CleanUp.ReferenceOTU155901      0      0      0      0      0      0
## New.CleanUp.ReferenceOTU321320      0      0      0      0      0      0
## KC551585.1.1451                     1      0      8      2      5      0
## JQ945994.1.1399                     0      0      0      0      0      0
## EF653577.1.1339                     0      0      0      0      0      0
## JQ814729.1.1495                    14     24      5      5      3      2
##                                C199P2 C172N3 C172S3 C178S2 C178P3 C178S3
## New.CleanUp.ReferenceOTU155901      0      0      0      0      0      0
## New.CleanUp.ReferenceOTU321320      0      0      0      0      0      0
## KC551585.1.1451                     2      5      8      0      1      0
## JQ945994.1.1399                     0      0      0      0      0      0
## EF653577.1.1339                     0      0      0      0      0      0
## JQ814729.1.1495                     2    121     31      0      0      5
##                                C172N1 C172S1 C178N3 C185N2 C185N3 C185S3
## New.CleanUp.ReferenceOTU155901      0      0      0      0      0      0
## New.CleanUp.ReferenceOTU321320      0      0      0      0      0      0
## KC551585.1.1451                     0      0      1      0      0      0
## JQ945994.1.1399                     0      0      0      0      0      0
## EF653577.1.1339                     0      0      0      0      0      0
## JQ814729.1.1495                     5     16      4      6     27      4
##                                C214N2 C172P2 C185S2 C172P3 C185N1 C172N2
## New.CleanUp.ReferenceOTU155901      0      0      0      0      0      0
## New.CleanUp.ReferenceOTU321320      0      0      1      0      0      0
## KC551585.1.1451                     3      2      0      2      1      6
## JQ945994.1.1399                     0      0      0      0      0      0
## EF653577.1.1339                     0      0      0      0      0      0
## JQ814729.1.1495                     4     13      1      3     22     21
##                                C178S1 C185S1 C172S2 C178N2
## New.CleanUp.ReferenceOTU155901      0      0      0      0
## New.CleanUp.ReferenceOTU321320      0      0      0      0
## KC551585.1.1451                     0      1      0      3
## JQ945994.1.1399                     0      0      3      0
## EF653577.1.1339                     0      0      0      0
## JQ814729.1.1495                     2      3     21     18
sample.summary <- data.frame(sample_data(biom)) %>%
    group_by(Site,Date) %>%
    summarize(total=n())
sample.summary
## Source: local data frame [18 x 3]
## Groups: Site [?]
## 
##      Site   Date total
##    <fctr> <fctr> <int>
## 1   North    172     3
## 2   North    178     3
## 3   North    185     3
## 4   North    199     3
## 5   North    206     3
## 6   North    214     3
## 7   Point    172     3
## 8   Point    178     3
## 9   Point    185     3
## 10  Point    199     3
## 11  Point    206     2
## 12  Point    214     3
## 13  South    172     3
## 14  South    178     3
## 15  South    185     3
## 16  South    199     3
## 17  South    206     2
## 18  South    214     3
# Takes the sample data we just connected to the phyloseq file, in the form of a dataframe,
# Groups it by each combination of sample site and date
# reports the total number of samples we have for each site/date combo - we have 2-3 samples for each.
variables <- expand.grid(Site=c("North","South"),Date=c("172","178","185","199","206","214"))
variables
##     Site Date
## 1  North  172
## 2  South  172
## 3  North  178
## 4  South  178
## 5  North  185
## 6  South  185
## 7  North  199
## 8  South  199
## 9  North  206
## 10 South  206
## 11 North  214
## 12 South  214
# Creating a matrix with the different combinations of variables for Site and date
# We are creating a function called Dif_Abund that will run deseq on each subset of factors (day and Site), 
# and return the factors, OTU ID, base mean, log2-fold change, the SE, and the p value associated with the response.

# This is testing for the question, on each date, are there differences between sample sites?

Dif_Abund = function(Site,Date){
  biom.pruned = prune_samples(sample_data(biom)$Date == paste(Date),biom)
  biom.pruned = prune_samples((sample_data(biom.pruned)$Site == paste(Site) | sample_data(biom.pruned)$Site == "Point"),biom.pruned)
  # Pulling out the samples we want to compare - we will run DESeq for each date. So, we pull out the rows from the site we are interested in,
  # and then also the baseline location (North) for that day, from our biom object.
  biom.pruned = prune_taxa(taxa_sums(biom.pruned) > 0, biom.pruned)
  # Cutting out any OTUs that don't show up in this comparison
  taxonomy <- data.frame(tax_table(biom.pruned))
  # Storing the taxonomy data
  dseq = phyloseq_to_deseq2(biom.pruned, ~Site)
  # Creating the dseq file, and using the formula that tells it we are interested in Site as a factor, and date as the baseline.
  dseq$Site = relevel(dseq$Site,"Point")
  # Establishing that the baseline comparison is for the first day.
  dseq = DESeq(dseq, quiet = TRUE, fitType = "local")
  # Running the actual DESeq function
  results = results(dseq, cooksCutoff=TRUE)
  # Reporting the results from the dseq object we just created
  # You can change Cooks Cutoff to control outliers (TRUE) or not (FALSE)
  results$Date = Date
  results$Site = Site
  # Creates a column in the results dataframe listing the Site
  results = data.frame(results$Site,results$Date,rownames(results),results$baseMean,results$log2FoldChange,results$lfcSE,results$pvalue,taxonomy[,1:7])
  # Pulling out all the results we are actually interested in
  colnames(results) = c("Site","Date","OTU","baseMean","l2FC","SE","pvalue","Kingdom","Phylum","Class","Order","Family","Genus","Species")
  # Naming the columns appropriately
  results
  # Return the final results dataframe
}
DA <- mdply(variables, Dif_Abund)
# Runs the  differential abundance function we created above on all the combinations of variables
# We are leaving out the first variable, because that is the comparison baseline.
# The formula, as written above, will not tell us if the sites are different from one to the next
# It is looking for OTUs that are increased in relative abundance (compared to the first sampling day)
# across all sites, but accounting for the fact that the sites have different baselines, basically.
threshold = function (thresh){
  filter(DA, baseMean >= thresh) %>% 
    mutate(padj = p.adjust(pvalue,"BH")) %>%
    summarize(cutoff=thresh, count=sum(padj<=0.10, na.rm = TRUE))
}
# Creates a function called "threshold" that will take our DA table as created above, then filter it so that we only keep
# rows where the baseMean (roughly the relative abundance for that OTU) is greater than some threshold, "thresh"
# Then, for those remaining rows, we run a Benjamini-Hochberg correction on our p-values, outputting these adjusted
# values in a new colum ("padj")
# Finally, we report back the threshold, and then the sum of taxa for which we have adjusted p values
# less than or equal to 0.10, our chosen false discovery rate
range = seq(0,3,0.25)
# Creates a sequence of numbers we are interested in for adjusted p values (from 0-3 by 0.25 increments)
thresh.s <- ldply(range, threshold)
# Applys the Threshold function we created above to the range of numbers we created above.

plot(thresh.s$count~thresh.s$cutoff)

# We can plot the threshold for base Mean value against the number of samples that will pass under this cutoff.
# We can see the optimum value to use here (here, 1.5)
l2fc <- group_by(DA, Site, Date) %>%
  filter(baseMean>=1.5) %>% 
  mutate(padj = p.adjust(pvalue,"BH"))
# We take that differential abundance table we created above ("DA"), and take the sum of the baseMean for each date
# Then we filter the whole set of values to include only those OTUs for which their baseMean was >1.5,
# We then adjust the p values only for those that we expect might be significant.
# I.e., those of very low relative abundance in the first place are not getting assessed at all,
# Allowing us to not test their significance - so untestested values will have NA in the padj column.
dim(l2fc[is.na(l2fc$padj)==TRUE,])[1]/dim(l2fc[])[1]
## [1] 0.007051813
# Fraction of OTUs that were not tested for significance (padj is "NA")
# We didn't actually cut it down that much, but it might help.
cutoff = 1
FDR = 0.1

d = l2fc %>%
  group_by(Site,Date)%>%
  mutate(Sig = ifelse(padj<FDR&l2FC>=cutoff,1,0))%>%
  mutate(Sig = ifelse(is.na(padj)==TRUE,0,Sig))%>%
  group_by(Site,Date)%>%
  count(Site,Date,Sig)%>%
  group_by(Site,Date)%>%
  mutate(Fraction=n/sum(n))
d
## Source: local data frame [24 x 5]
## Groups: Site, Date [12]
## 
##      Site   Date   Sig     n   Fraction
##    <fctr> <fctr> <dbl> <int>      <dbl>
## 1   North    172     0  2355 0.94426624
## 2   North    172     1   139 0.05573376
## 3   North    178     0  1754 0.84570878
## 4   North    178     1   320 0.15429122
## 5   North    185     0  1746 0.84063553
## 6   North    185     1   331 0.15936447
## 7   North    199     0  1909 0.83289703
## 8   North    199     1   383 0.16710297
## 9   North    206     0  3034 0.95528967
## 10  North    206     1   142 0.04471033
## # ... with 14 more rows
# Creates a little table showing which fraction of OTUs were designated as significant or not on each date
# A good number are - 10-30%
FDR = 0.1
d = l2fc

d = group_by(d, Site,Date) %>%
  mutate(sig = ifelse(padj<FDR,1,0))%>%
  mutate(Total=sum(baseMean)) %>%
  mutate(relabund=baseMean/Total)%>%
  filter(padj != 'NA')
# Creates a column designating whether (1) or not (0) the padj is lower than our false discovery rate (set above)
# We also calculate a new relative abundance value using the average across samples
# and we cut out those rows with no p-values

PhylumFraction = 0.005
OTUFraction = 0.001
PhylumList = d %>%
  group_by(Phylum,Date)%>%
  filter(sum(relabund)>=PhylumFraction | relabund>=OTUFraction)
PhylumList = levels(droplevels(PhylumList$Phylum))
# Looks at each phylum, on each date, and sees if the total abundance of OTUs within that phylum is
# greater than the cutoff (PhylumFraction, defined above), OR if the single OTU makes up a large enough fraciton
# of the total community on its own (OTUFraction, defined above)
# Thus, we have selected the phyla we want to plot.

d = filter(d, Phylum %in% PhylumList)

max.l2FC = ddply(d, .(Phylum), summarize, M = max(l2FC))
d$Phylum = factor(d$Phylum, max.l2FC[order(-max.l2FC$M),]$Phylum)
# makes a dataframe with the maximum value of log2Fold change for each phylum
# takes our phylum column, and arranges it in order of our log2FoldChange values

d$sig = as.factor(d$sig)
d.yes = d[d$sig==1,]
d.no = d[d$sig==0,]
# Creates new columns (true/false) for whether it is significant or not
p = ggplot(d)
p = p + geom_point(data=d.yes, aes(x = Phylum, y = l2FC, fill = Phylum, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
# p = p + geom_point(data=d.no, aes(x = Date, y = l2FC, fill = sig, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class

p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized

p = p + facet_grid(Date~Site, scales="free_x")
# Saying we want it to present the data separately for each phylum

p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values

p = p + theme_bw()
# Sets a theme

p = p + theme(strip.text.x = element_text(size = 16),
              strip.text.y = element_text(size = 16),
              axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
              axis.title.x = element_text(size = 18),
              axis.text.y = element_text(size=16),
              axis.title.y = element_text(size = 18),
              legend.title = element_text(size=18),
              legend.text = element_text(size = 14),
              #legend.position = "none",
              strip.background = element_blank(),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text

p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes.

p

dProteo = d %>%
  filter(Phylum=="Proteobacteria")
d.yesProteo = d.yes %>%
  filter(Phylum=="Proteobacteria")
d.noProteo = d.no %>%
  filter(Phylum=="Proteobacteria")
# Subsetting our data for Proteos

p = ggplot(dProteo)
p = p + geom_point(data=d.yesProteo, aes(x = Order, y = l2FC, fill = Order, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
# p = p + geom_point(data=d.noProteo, aes(x = Date, y = l2FC, fill = sig, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class

p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized

p = p + facet_grid(Date~Site~Class, scales="free_x")
# Saying we want it to present the data separately for each phylum

p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values

p = p + theme_bw()

p = p + theme(strip.text.x = element_text(size = 16),
              strip.text.y = element_text(size = 16),
              axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
              axis.title.x = element_text(size = 18),
              axis.text.y = element_text(size=16),
              axis.title.y = element_text(size = 18),
              legend.title = element_text(size=18),
              legend.text = element_text(size = 14),
              #legend.position = "none",
              strip.background = element_blank(),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text

p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes.

p + theme(legend.position="none")

dBacter = d %>%
  filter(Phylum=="Bacteroidetes")
d.yesBacter = d.yes %>%
  filter(Phylum=="Bacteroidetes")
d.noBacter = d.no %>%
  filter(Phylum=="Bacteroidetes")

p = ggplot(dBacter)
p = p + geom_point(data=d.yesBacter, aes(x = Class, y = l2FC, fill = Class, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
#p = p + geom_point(data=d.noBacter, aes(x = Class, y = l2FC, fill = Class, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class

p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized

p = p + facet_grid(Date~Site, scales="free_x")
# Saying we want it to present the data separately for each phylum

p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values

p = p + theme_bw()
# Sets a theme

p = p + theme(strip.text.x = element_text(size = 16),
              strip.text.y = element_text(size = 16),
              axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
              axis.title.x = element_text(size = 18),
              axis.text.y = element_text(size=16),
              axis.title.y = element_text(size = 18),
              legend.title = element_text(size=18),
              legend.text = element_text(size = 14),
              #legend.position = "none",
              strip.background = element_blank(),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text

p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes.

p + theme(legend.position="none")

# Find methanotrophs
methanolist <- read.table(file = "methanos.txt")
methanolist <- as.vector(methanolist$V1)
# Create a list of all the possible methanotroph genera
df = l2fc %>%
  filter(Genus %in% as.factor(methanolist))
# Take out only the methanotrophs
df = group_by(df, Site,Date) %>%
  mutate(sig = ifelse(padj<FDR,1,0))%>%
  mutate(Total=sum(baseMean)) %>%
  mutate(relabund=baseMean/Total)%>%
  filter(padj != 'NA')
# Creates a column designating whether (1) or not (0) the padj is lower than our false discovery rate (set above)
# We also calculate a new relative abundance value using the average across samples
# and we cut out those rows with no p-values

df$sig = as.factor(df$sig)
df.yes = df[df$sig==1,]
df.no = df[df$sig==0,]
# Creates new columns (true/false) for whether it is significant or not
p = ggplot(df)

p = p + geom_point(data=df.yes, aes(x = Date, y = l2FC, fill = Site, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
p = p + geom_point(data=df.no, aes(x = Date, y = l2FC, fill = sig, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class

p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized

p = p + facet_grid(~Genus, scales="free_x")
# Saying we want it to present the data separately for each phylum

p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values

p = p + theme_bw()

p = p + theme(strip.text.x = element_text(size = 16),
              strip.text.y = element_text(size = 16),
              axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
              axis.title.x = element_text(size = 18),
              axis.text.y = element_text(size=16),
              axis.title.y = element_text(size = 18),
              legend.title = element_text(size=18),
              legend.text = element_text(size = 14),
              #legend.position = "none",
              strip.background = element_blank(),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text

p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes

p

mdf = psmelt(biom)
#"melting" the phyloseq data into a dataframe

# We want to calculate the total relative abundance of each phylum
# and then take the top X most abundant phyla

SortedPhyla = mdf %>%
  group_by(Sample,Phylum) %>%
  summarize(PhyAbund = sum(Abundance))%>%
  # Should give us the relative abundance of all OTUs in each phylum from each sample.
  group_by(Phylum)%>%
  summarize(MeanPhyAbund = mean(PhyAbund))%>%
  arrange(-MeanPhyAbund)
# Then sums the mean relative abundance of each phylum, across samples, and reports it from most to least
nPhyla = 13
# How many phyla do we want to include (plus 1 for NAs)?
PhylumList = SortedPhyla[1:nPhyla,1]
PhylumList = PhylumList[is.na(PhylumList)==FALSE,]
PhylumList = levels(droplevels(as.factor(PhylumList$Phylum)))
# List of nPhyla top most abundant phyla
PhylumList
##  [1] "[Thermi]"         "Acidobacteria"    "Actinobacteria"  
##  [4] "Armatimonadetes"  "Bacteroidetes"    "Cyanobacteria"   
##  [7] "Gemmatimonadetes" "GN02"             "Planctomycetes"  
## [10] "Proteobacteria"   "SR1"              "Verrucomicrobia"
biom2 = subset_taxa(biom, Phylum %in% PhylumList)
mdf2 = psmelt(biom2)
head(mdf2)
##                         OTU Sample Abundance TreatmentGroup  Site Date
## 231063      HQ178949.1.1333 C206N3     51716           Late North  206
## 2001312 New.ReferenceOTU480 C172P3     46742          Early Point  172
## 2001291 New.ReferenceOTU480 C206N2     28094           Late North  206
## 2012320 New.ReferenceOTU806 C206N2     23022           Late North  206
## 2018804 New.ReferenceOTU992 C178S2     21465          Early South  178
## 2001273 New.ReferenceOTU480 C172P2     20245          Early Point  172
##                               Description SampleID.1  DateSite  Kingdom
## 231063  Sample of day 206 at site North 3     C206N3 206 North Bacteria
## 2001312 Sample of day 172 at site Point 3     C172P3 172 Point Bacteria
## 2001291 Sample of day 206 at site North 2     C206N2 206 North Bacteria
## 2012320 Sample of day 206 at site North 2     C206N2 206 North Bacteria
## 2018804 Sample of day 178 at site South 2     C178S2 178 South Bacteria
## 2001273 Sample of day 172 at site Point 2     C172P2 172 Point Bacteria
##                 Phylum               Class           Order          Family
## 231063  Proteobacteria Gammaproteobacteria   Aeromonadales  Aeromonadaceae
## 2001312  Cyanobacteria         Chloroplast   Stramenopiles            <NA>
## 2001291  Cyanobacteria         Chloroplast   Stramenopiles            <NA>
## 2012320       [Thermi]          Deinococci       Thermales      Thermaceae
## 2018804 Proteobacteria Gammaproteobacteria Methylococcales Crenotrichaceae
## 2001273  Cyanobacteria         Chloroplast   Stramenopiles            <NA>
##               Genus Species
## 231063         <NA>    <NA>
## 2001312        <NA>    <NA>
## 2001291        <NA>    <NA>
## 2012320 Meiothermus    <NA>
## 2018804  Crenothrix    <NA>
## 2001273        <NA>    <NA>
mdf2 = mdf2 %>%
  group_by(Sample,Site,Date,Phylum) %>%
  summarize(PhyAbund = sum(Abundance))
p = ggplot(mdf2, aes(Date, PhyAbund, fill = Date))

p = p + geom_boxplot()

p = p + facet_wrap(~Phylum, scales = "free_y")

#p = p + scale_fill_manual(values=c("orange","gold1","red3"))

p = p + theme_bw()

p = p + guides(fill = "none")
p = p + theme(legend.position = "none")

p = p + theme(strip.text.x = element_text(size=11), 
              strip.text.y = element_text(size=9), 
              strip.background = element_rect(colour="white", fill="white"))

p = p + theme(axis.text.x = element_text(size = 9, angle = 45, hjust=1),axis.text.y = element_text(size = 9))
p = p + theme(axis.title.x = element_blank())
p = p + theme(axis.title.y = element_text(size = 15, vjust = 1))

p = p + labs(x="Amendment",y="Relative abundance")

p